NCORES = 2
registerDoParallel(NCORES)
register(DoparParam())
ncells = 1000
add = '_ziadd0.67'
fileName = sprintf('simLun_%s%s', ncells, add)
load(paste0(fileName, '.rda'))
counts = simData[[1]]$counts
counts = counts[rowSums(counts) != 0, ]

Methods

ZINB

zinb = zinbFit(counts, K = 2, commondispersion = FALSE)
W = getW(zinb)

PCA

library(rARPACK)
fastpca <- function(expr, K = 2, scale=FALSE) {
  svd_raw <- svds(scale(t(expr), center=TRUE, scale=scale), k=K, nu=K, nv=0)
  pc_raw <- svd_raw$u %*% diag(svd_raw$d[1:K])
  return(pc_raw)
}

totalcount = function (ei)
{
  sums = colSums(ei)
  eo = t(t(ei)*mean(sums)/sums)
  return(eo)
}

tc <- totalcount(counts)
tmm <- TMM_FN(counts)
fq <- FQT_FN(counts)

pca_raw <- fastpca(log1p(t(counts)))
pca_tc  <- fastpca(log1p(tc))
pca_tmm <- fastpca(log1p(tmm))
pca_fq  <- fastpca(t(log1p(fq)))

ZIFA

wrapRzifa <- function(Y, block = TRUE, k=2){
  # wrapper R function for ZIFA.
  # md5 hashing and temporary files are used not to re-run zifa 
  # if it has already be run on this computer.
  d = digest(Y, "md5")
  tmp = paste0(tempdir(), '/', d)
  write.csv(Y, paste0(tmp, '.csv'))
  
  if (!file.exists(paste0(tmp, "_", k, '_zifa.csv'))){
    print('run ZIFA')
    bb = ifelse(block, '-b ', '')
    cmd = sprintf('python ../run_zifa.py -d %d %s%s.csv %s_%d_zifa.csv', k, bb, tmp, tmp, k)
    system(cmd)
  }
  read.csv(sprintf("%s_%d_zifa.csv", tmp, k), header=FALSE)
}

zifa_raw <- wrapRzifa(log1p(counts))
## [1] "run ZIFA"
zifa_tc  <- wrapRzifa(log1p(tc))
## [1] "run ZIFA"
zifa_tmm <- wrapRzifa(log1p(tmm))
## [1] "run ZIFA"
zifa_fq  <- wrapRzifa(log1p(fq))
## [1] "run ZIFA"

Clustering

res <- list('ZINB-WaVE_K=2' = W,
            'PCA_RAW' = pca_raw, 'PCA_TC' = pca_tc,
            'PCA_TMM' = pca_tmm, 'PCA_FQ' = pca_fq,
            'ZIFA_RAW' = zifa_raw, 'ZIFA_TC' = zifa_tc,
            'ZIFA_TMM' = zifa_tmm, 'ZIFA_FQ' = zifa_fq)

cl <- lapply(1:length(res), function(i){
  km = kmeans(res[[i]], centers = 3, iter.max = 1000, nstart = 1000)
  estClust = km$cluster
  
  par(mfrow=c(1,2))
  plot(res[[i]], col = labels, main = paste(names(res)[i], '\nTrue Labels'))
  plot(res[[i]], col = estClust, main = paste(names(res)[i], '\nFound Labels'))
  par(mfrow=c(1,1))
  
  estClust
})

names(cl) = names(res)
lapply(cl, table, labels)
## $`ZINB-WaVE_K=2`
##    labels
##       1   2   3
##   1 333   0   0
##   2   0 333   0
##   3   0   0 334
## 
## $PCA_RAW
##    labels
##       1   2   3
##   1 140 152 149
##   2 123 106 117
##   3  70  75  68
## 
## $PCA_TC
##    labels
##       1   2   3
##   1  94 114  86
##   2 138 125 160
##   3 101  94  88
## 
## $PCA_TMM
##    labels
##       1   2   3
##   1 131 126 145
##   2 101  94  90
##   3 101 113  99
## 
## $PCA_FQ
##    labels
##       1   2   3
##   1 135 148 143
##   2 128 110 121
##   3  70  75  70
## 
## $ZIFA_RAW
##    labels
##       1   2   3
##   1 257   0   8
##   2  74 108 106
##   3   2 225 220
## 
## $ZIFA_TC
##    labels
##       1   2   3
##   1 111  53  15
##   2 221 278   4
##   3   1   2 315
## 
## $ZIFA_TMM
##    labels
##       1   2   3
##   1 322   7  30
##   2   6 324   1
##   3   5   2 303
## 
## $ZIFA_FQ
##    labels
##       1   2   3
##   1   0  16 288
##   2 198 118  34
##   3 135 199  12